####### The unreading project #######

#this project looks at citation practices in the Goethe Jahrbuch
#as a means of understanding the distribution of attention in Goethe criticism
#it is limited by only looking at the Jahrbuch and thus should be seen as a pilot project
#the data used here was generated using Jonathan Reeve's text-matcher package on GitHub:
#https://github.com/JonathanReeve/text-matcher

# Load the two match tables (whole corpus and, judging by the file name,
# poetry only).
a3 <- read.csv("Corpus_ngram3.csv")
p <- read.csv("Poetry_ngram3.csv")

# Which works are the most cited?
# Add up Num.Matches per cited work (Text.B), rank the totals from most to
# least quoted, print the first ten and save the full ranking.
a3.table <- sort(
  tapply(a3$Num.Matches, a3$Text.B, sum),
  decreasing = TRUE
)
a3.table[1:10]
write.csv(a3.table, file = "TopCitedWorks.csv")

# Which volumes are uncited?
# m.non holds the row positions of corpus filenames (column V1) that never
# occur as a cited text (Text.B) in a3; m2 holds the corresponding entries.
m <- read.csv(
  "GoetheCorpusFilenames.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
m.non <- which(!(as.character(m$V1) %in% as.character(a3$Text.B)))
m2 <- m[m.non, ]

# The reverse check: cited names in a3 that are missing from the filename
# list. The unique offending names are printed.
a3.non <- a3[which(!(as.character(a3$Text.B) %in% as.character(m$V1))), ]
unique(as.character(a3.non$Text.B))

# Poetry
# For poetry, every row of p counts once per cited poem instead of summing
# Num.Matches, so that a single article quoting one poem heavily cannot skew
# the ranking (presumably one row per citing volume -- confirm against the
# layout of Poetry_ngram3.csv).
p$count <- 1
p.table <- sort(
  tapply(p$count, p$Text.B, sum),
  decreasing = TRUE
)
p.table[1:20]
write.csv(p.table, file = "TopCitedWorksPoetry.csv")

# By genre
# NOTE(review): TopCitedWorksPoetry_Genre.csv is not written anywhere in the
# code shown here; presumably it is TopCitedWorksPoetry.csv with a Genre
# column added by hand -- confirm.
pg <- read.csv("TopCitedWorksPoetry_Genre.csv")
pg.table <- sort(
  tapply(pg$Count, pg$Genre, sum),
  decreasing = TRUE
)

# Poetry uncited
# p.non holds the row positions of poem filenames (column V1) that never
# occur as Text.B in p; p2 holds the corresponding entries.
p.files <- read.csv(
  "GoethePoetryFilenames.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
p.non <- which(!(as.character(p.files$V1) %in% as.character(p$Text.B)))
p2 <- p.files[p.non, ]

# Plot
# Scatter plot of quotation totals per work against the work's position in
# a3.table (which was sorted in decreasing order when it was built).
library(ggplot2)
a3.graph <- data.frame(a3.table)
a3.graph$works <- row.names(a3.graph)
# seq_len() instead of 1:nrow(): 1:0 would produce c(1, 0) and break the
# assignment when the table is empty.
a3.graph$index <- seq_len(nrow(a3.graph))
colnames(a3.graph) <- c("count", "works", "index")
ggplot(a3.graph, aes(x = index, y = count)) +
  theme_bw() +
  #theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), legend.position="none") +
  geom_point(shape = 1) +
  labs(
    x = "Works",
    y = "No. Quotations",
    title = "Distribution of quotations across Goethe's corpus"
  )

# Calculate a Gini coefficient for the per-work quotation totals
library(ineq)
Gini(a3.graph$count)

# Which genres are the most cited?
library(splitstackshape)
library(stringr)
# First split Text.B on underscores to extract period information
a3.df <- cSplit(a3, "Text.B", sep = "_", type.convert = FALSE)
write.csv(a3.df, file = "Corpus_ngram3_cleaned.csv")
# Then clean and import.
# NOTE(review): the write.csv() above overwrites Corpus_ngram3_cleaned.csv on
# every run, so any cleaning done by hand between these two steps is lost if
# the script is re-run top to bottom. Confirm how the `genre` and `Period`
# columns used below get into the file.
a3.df <- read.csv("Corpus_ngram3_cleaned.csv")

# Total matches per genre. The table is deliberately left unsorted: tapply()
# returns the groups in factor-level order, and the normalisation further
# down picks genres by position. (A sorted copy used to be assigned here and
# was immediately overwritten, so it has been removed.)
genre.t <- tapply(a3.df$Num.Matches, a3.df$genre, sum)
#genre.length<-tapply(a3.df$Text.B.Length, a3.df$genre, sum)
#genre.norm<-sort(genre.t/genre.length, decreasing = T)*100000

# Normalize for length, i.e. are these rates commensurate with how many words
# each genre consumes?
library(tm)
corpus1 <- VCorpus(
  DirSource("GoetheWerkeCompletePeriod", encoding = "UTF-8"),
  readerControl = list(language = "German")
)
corpus1 <- tm_map(corpus1, content_transformer(stripWhitespace))
corpus1 <- tm_map(corpus1, content_transformer(tolower))
corpus1 <- tm_map(corpus1, content_transformer(removeNumbers))
#corpus1 <- tm_map(corpus1, removeWords, stopwords("German"))
corpus1 <- tm_map(corpus1, content_transformer(removePunctuation))
corpus1.dtm <- DocumentTermMatrix(
  corpus1,
  control = list(wordLengths = c(1, Inf))
)
corpus1.matrix <- as.matrix(corpus1.dtm, stringsAsFactors = FALSE)
# Get word counts by doc
wc <- rowSums(corpus1.matrix)
# Subset by genres, using the document (file) names.
# fixed = TRUE makes the "." in ".txt" a literal dot instead of a regex
# wildcard that would match any character.
wc.poetry <- wc[grep("Poetry.txt", names(wc), fixed = TRUE)]
wc.drama <- wc[grep("Drama.txt", names(wc), fixed = TRUE)]
wc.prose <- wc[grep("Prose.txt", names(wc), fixed = TRUE)]
wc.science <- wc[grep("naturalscience", names(wc))]
wc.critical <- wc[grep("Critical", names(wc), ignore.case = TRUE)]

# Matches per 100,000 words for each genre.
# NOTE(review): genre.t is indexed by position, which assumes its groups come
# in the order critical, drama, naturalscience, poetry, prose -- confirm
# against names(genre.t) before trusting these figures.
(genre.t[1] / sum(wc.critical)) * 100000
(genre.t[2] / sum(wc.drama)) * 100000
(genre.t[3] / sum(wc.science)) * 100000
(genre.t[4] / sum(wc.poetry)) * 100000
(genre.t[5] / sum(wc.prose)) * 100000

# Which periods are the most cited?
period.t <- sort(
  tapply(a3.df$Num.Matches, a3.df$Period, sum),
  decreasing = TRUE
)

# Same totals after removing critical and scientific writing
a3.sub <- a3.df[!(a3.df$genre %in% c("critical", "naturalscience")), ]
period.t2 <- sort(
  tapply(a3.sub$Num.Matches, a3.sub$Period, sum),
  decreasing = TRUE
)

# Normalize by # words: per-period word counts, selected by file name
wc.early <- wc[grep("Early_", names(wc))]
wc.mid <- wc[grep("Middle_", names(wc))]
wc.late <- wc[grep("Late_", names(wc))]

# Matches per 100,000 words for each period.
# NOTE(review): period.t2 is sorted by count, so positions 1, 2, 3 are
# late, early, middle only if the data happen to rank that way -- confirm
# against names(period.t2).
# NOTE(review): the wc.* sums cover every file whose name matches the period,
# while period.t2 excludes critical and naturalscience rows -- confirm that
# the denominators are meant to include those genres.
(period.t2[1] / sum(wc.late)) * 100000
(period.t2[2] / sum(wc.early)) * 100000
(period.t2[3] / sum(wc.mid)) * 100000

#### Comparing the language of cited works to uncited works ####

# Ingest list of cited poetry
p <- read.csv("Poetry_ngram3.csv")

# Ingest list of all poetry
p.all <- read.csv("GoethePoetryFilenames.csv", header = FALSE)

# Split the filename list (column V1) into poems that never occur as Text.B
# in p (p.non) and poems that do (p.cite). Each variable first holds row
# positions and is then replaced by the filenames as character strings.
p.non <- which(!(as.character(p.all$V1) %in% as.character(p$Text.B)))
p.non <- as.character(p.all[p.non, ])
p.cite <- which(as.character(p.all$V1) %in% as.character(p$Text.B))
p.cite <- as.character(p.all[p.cite, ])

# Ingest poetry corpus and build a document-term matrix
library(tm)
corpus1 <- VCorpus(
  DirSource("GoethePoetryReducedMinusLarge", encoding = "UTF-8"),
  readerControl = list(language = "German")
)
corpus1 <- tm_map(corpus1, content_transformer(stripWhitespace))
corpus1 <- tm_map(corpus1, content_transformer(tolower))
corpus1 <- tm_map(corpus1, content_transformer(removeNumbers))
#corpus1 <- tm_map(corpus1, removeWords, stopwords("German"))
corpus1 <- tm_map(corpus1, content_transformer(removePunctuation))
corpus1.dtm <- DocumentTermMatrix(
  corpus1,
  control = list(wordLengths = c(1, Inf))
)
corpus1.matrix <- as.matrix(corpus1.dtm, stringsAsFactors = FALSE)
# Keep top 5K words.
# head() instead of [1:5000]: indexing past the end pads the vector with NA
# entries when the vocabulary has fewer than 5000 terms.
top <- head(sort(colSums(corpus1.matrix), decreasing = TRUE), 5000)
# Keep only words that appear more than 50 times
top <- top[top > 50]
# drop = FALSE keeps the result a matrix even if a single column or row
# survives the filter, so colSums()/rowSums() further down still work.
corpus1.matrix <- corpus1.matrix[
  , colnames(corpus1.matrix) %in% names(top),
  drop = FALSE
]

# Subset by cited and non-cited poems (matched on document names)
c.non <- corpus1.matrix[row.names(corpus1.matrix) %in% p.non, , drop = FALSE]
c.cite <- corpus1.matrix[row.names(corpus1.matrix) %in% p.cite, , drop = FALSE]

#find distinctive words of each subcorpus
# Helper for the log-likelihood ratio: turns the counts in k into shares of
# their total and returns sum(share * log(share)).
# The (k == 0) term adds 1 inside the log for zero counts, so those cells
# contribute 0 * log(1) = 0 instead of 0 * log(0) = NaN.
H <- function(k) {
  total <- sum(k)
  share <- k / total
  sum(share * log(share + (k == 0)))
}
# Per-word totals in the non-cited (group1) and cited (group2) subcorpora
word1 <- colSums(c.non)
word2 <- colSums(c.cite)
all1 <- sum(word1)
all2 <- sum(word2)
# One row per word. The result columns are created at full length with
# numeric() rather than a recycled scalar 0, so that an empty vocabulary
# gives an empty data frame instead of a "differing number of rows" error.
results <- data.frame(
  word = colnames(c.non),
  group1 = word1,
  group2 = word2,
  G2 = numeric(length(word1)),
  fisher.OR = numeric(length(word1)),
  fisher.p = numeric(length(word1))
)
# seq_len() instead of 1:ncol(): 1:0 would iterate over c(1, 0).
for (j in seq_len(ncol(c.non))) {
  print(j) # progress indicator
  # 2x2 table: rows = (this word, all other words), columns = (group1, group2)
  cont.table <- data.frame(
    c(word1[j], all1 - word1[j]),
    c(word2[j], all2 - word2[j])
  )
  fish <- fisher.test(cont.table)
  # Log-likelihood ratio (G2) computed from the table and its margins via H()
  LLR <- 2 * sum(cont.table) *
    (H(cont.table) - H(rowSums(cont.table)) - H(colSums(cont.table)))
  results$G2[j] <- LLR
  results$fisher.OR[j] <- fish$estimate
  results$fisher.p[j] <- fish$p.value
}
# Sort by G2, largest first
dunning.df <- results[order(-results$G2), ]

# The ranking above orders words by strength of deviation, whether they are
# over or under the expected value. The code below produces a signed ranking
# that separates the two directions.
dunning.sort <- dunning.df
dunning.sort$diff <- dunning.sort$group1 - dunning.sort$group2
# Signed G2: negative when Fisher's odds ratio is <= 1, positive otherwise.
# Computed in one vectorised step rather than by appending to a vector inside
# a 1:nrow() loop, which was quadratic and failed on an empty table.
G2_Sort.v <- ifelse(
  dunning.sort$fisher.OR <= 1,
  -dunning.sort$G2,
  dunning.sort$G2
)
dunning.sort <- cbind(dunning.sort, G2_Sort.v)
dunning.sort <- dunning.sort[order(-dunning.sort$G2_Sort.v), ]

# Keep only words below the significance threshold
dunning.sort <- dunning.sort[dunning.sort$fisher.p < 0.05, ]
write.csv(dunning.sort, file = "TopWords.csv")

### Run Topic Model to generalize about word patterns
library(topicmodels)
library(tm)
corpus1 <- VCorpus(
  DirSource("GoethePoetryReducedMinusLarge", encoding = "UTF-8"),
  readerControl = list(language = "German")
)
corpus1 <- tm_map(corpus1, content_transformer(stripWhitespace))
corpus1 <- tm_map(corpus1, content_transformer(tolower))
corpus1 <- tm_map(corpus1, content_transformer(removeNumbers))
corpus1 <- tm_map(corpus1, removeWords, stopwords("German"))
corpus1 <- tm_map(corpus1, content_transformer(removePunctuation))
#corpus1 <- tm_map(corpus1, stemDocument, language = "german")
corpus1.dtm <- DocumentTermMatrix(
  corpus1,
  control = list(wordLengths = c(1, Inf))
)
corpus1.matrix <- as.matrix(corpus1.dtm, stringsAsFactors = FALSE)
# Remove problem terms.
# drop = FALSE (here and below) keeps the result a matrix even when only one
# column or row is left.
corpus1.matrix <- corpus1.matrix[
  , !colnames(corpus1.matrix) %in% c("s", "seite", "apparat", "sei", "hast", "wär", "schon", "wäre", "hätt"),
  drop = FALSE
]
# Keep top 3K words.
# head() instead of [1:3000]: indexing past the end pads the vector with NA
# entries when fewer than 3000 terms remain.
top <- head(sort(colSums(corpus1.matrix), decreasing = TRUE), 3000)
corpus1.matrix <- corpus1.matrix[
  , which(colnames(corpus1.matrix) %in% names(top)),
  drop = FALSE
]
# Run topic model on k topics
k <- 20
control_LDA_Gibbs <- list(
  alpha = (50 / k),
  estimate.beta = TRUE,
  iter = 1000,
  burnin = 20,
  best = TRUE,
  seed = 2
)
topicmodel <- LDA(corpus1.matrix, method = "Gibbs", k = k, control = control_LDA_Gibbs) # k = # topics
# Top 20 terms of every topic
term_dis <- terms(topicmodel, 20)

# Then run a comparison per topic to discover which topics differ most
# strongly between the two corpora.
# Establish word counts for each poem, split into non-cited and cited; the
# counts are taken over the filtered matrix built above.
c.non <- corpus1.matrix[row.names(corpus1.matrix) %in% p.non, , drop = FALSE]
c.cite <- corpus1.matrix[row.names(corpus1.matrix) %in% p.cite, , drop = FALSE]
non.wc <- rowSums(c.non)
cite.wc <- rowSums(c.cite)
# Bootstrap the mean of a vector.
#
# data: vector of observations to resample.
# num:  number of bootstrap resamples to draw.
#
# Returns a numeric vector of length num; each element is the mean of one
# resample of data, drawn with replacement and of the same length as data.
boot.mean <- function(data, num) {
  n.obs <- length(data)
  # Resample by index rather than calling sample(data): when data is a single
  # number >= 1, sample() would draw from 1:data instead of from data itself.
  # For longer vectors the random draws are the same as with sample().
  # seq_len() makes num = 0 return an empty vector (1:0 would run twice).
  vapply(
    seq_len(num),
    function(i) mean(data[sample.int(n.obs, replace = TRUE)]),
    numeric(1)
  )
}


# For every topic, compare how much of each poem is made up of the topic's
# top terms in non-cited vs. cited poems, and collect one result row per topic.
# Rows are gathered in a preallocated list and bound once at the end instead
# of growing topic.df with rbind() inside the loop.
topic.rows <- vector("list", k)
for (i in seq_len(k)) {
  topic.no <- i
  # Get list of your topic words
  topic.words <- term_dis[, topic.no]
  # Subset your original DTM by your word list (drop = FALSE keeps a matrix)
  sub <- corpus1.matrix[, colnames(corpus1.matrix) %in% topic.words, drop = FALSE]
  # Sum all your topic word counts per poem
  sub.df <- data.frame(rowSums(sub))
  # Share of each poem's words that belong to the topic, per group
  sub.a <- sub.df[row.names(sub.df) %in% p.non, ] / unname(non.wc)
  sub.b <- sub.df[row.names(sub.df) %in% p.cite, ] / unname(cite.wc)
  # Bootstrap the group means.
  # NOTE(review): no set.seed() call is visible before this resampling, so
  # the figures presumably vary from run to run -- confirm.
  a <- boot.mean(sub.a, 1000)
  b <- boot.mean(sub.b, 1000)
  # Run a t-test to compare means of group a v. b for a given topic.
  # NOTE(review): a and b each hold 1000 bootstrap means, so the test's sample
  # size is the number of resamples, not the number of poems, and the p-value
  # shrinks as the resample count grows. Confirm this is intended before
  # interpreting p.value.
  mean.non <- mean(a)
  mean.cite <- mean(b)
  mod <- t.test(a, b)
  p.value <- mod$p.value[1]
  topic <- i
  ratio <- mean.non / mean.cite
  topic.rows[[i]] <- data.frame(topic, mean.non, mean.cite, ratio, p.value)
}
topic.df <- do.call(rbind, topic.rows)

write.csv(topic.df, file = "Topics.csv")

# Word cloud of the topics whose ratio exceeds 1, sized by the squared ratio.
# NOTE(review): topic.label is not among the columns this script writes to
# Topics.csv (topic, mean.non, mean.cite, ratio, p.value); presumably the
# labels are added to the file by hand before this step -- confirm.
library(wordcloud)
topic <- read.csv("Topics.csv")
topic <- topic[topic$ratio > 1, ]
wordcloud(
  as.character(topic$topic.label),
  topic$ratio^2,
  min.freq = 0,
  random.order = FALSE,
  rot.per = 0
)







